library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.1.2 ✓ dplyr 1.0.6
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tidytext)
theme_set(theme_light())
#install.packages('schrute')
library(schrute)
office_df <- theoffice
glimpse(office_df)
## Rows: 55,130
## Columns: 12
## $ index <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16…
## $ season <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ episode <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ episode_name <chr> "Pilot", "Pilot", "Pilot", "Pilot", "Pilot", "Pilot",…
## $ director <chr> "Ken Kwapis", "Ken Kwapis", "Ken Kwapis", "Ken Kwapis…
## $ writer <chr> "Ricky Gervais;Stephen Merchant;Greg Daniels", "Ricky…
## $ character <chr> "Michael", "Jim", "Michael", "Jim", "Michael", "Micha…
## $ text <chr> "All right Jim. Your quarterlies look very good. How …
## $ text_w_direction <chr> "All right Jim. Your quarterlies look very good. How …
## $ imdb_rating <dbl> 7.6, 7.6, 7.6, 7.6, 7.6, 7.6, 7.6, 7.6, 7.6, 7.6, 7.6…
## $ total_votes <int> 3706, 3706, 3706, 3706, 3706, 3706, 3706, 3706, 3706,…
## $ air_date <fct> 2005-03-24, 2005-03-24, 2005-03-24, 2005-03-24, 2005-…
office_df %>%
slice(1:5) %>%
knitr::kable()
| 1 |
1 |
1 |
Pilot |
Ken Kwapis |
Ricky Gervais;Stephen Merchant;Greg Daniels |
Michael |
All right Jim. Your quarterlies look very good. How are things at the library? |
All right Jim. Your quarterlies look very good. How are things at the library? |
7.6 |
3706 |
2005-03-24 |
| 2 |
1 |
1 |
Pilot |
Ken Kwapis |
Ricky Gervais;Stephen Merchant;Greg Daniels |
Jim |
Oh, I told you. I couldn’t close it. So… |
Oh, I told you. I couldn’t close it. So… |
7.6 |
3706 |
2005-03-24 |
| 3 |
1 |
1 |
Pilot |
Ken Kwapis |
Ricky Gervais;Stephen Merchant;Greg Daniels |
Michael |
So you’ve come to the master for guidance? Is this what you’re saying, grasshopper? |
So you’ve come to the master for guidance? Is this what you’re saying, grasshopper? |
7.6 |
3706 |
2005-03-24 |
| 4 |
1 |
1 |
Pilot |
Ken Kwapis |
Ricky Gervais;Stephen Merchant;Greg Daniels |
Jim |
Actually, you called me in here, but yeah. |
Actually, you called me in here, but yeah. |
7.6 |
3706 |
2005-03-24 |
| 5 |
1 |
1 |
Pilot |
Ken Kwapis |
Ricky Gervais;Stephen Merchant;Greg Daniels |
Michael |
All right. Well, let me show you how it’s done. |
All right. Well, let me show you how it’s done. |
7.6 |
3706 |
2005-03-24 |
office_df %>%
group_by(season) %>%
mutate(avgIMDB = mean(imdb_rating)) %>%
ggplot(aes(season, imdb_rating, group = season, color = factor(season))) +
geom_boxplot() +
theme(legend.position = 'none') +
labs(title = "Average IMDB rading by season",
x = "Season",
y = "AVG IMDB rating") +
scale_x_continuous(breaks = 1:9)

top_characters <- office_df %>%
count(character, sort = T) %>%
slice(1:20) %>%
mutate(character = fct_reorder(character, n))
top_characters %>%
ggplot(aes(n, character, fill = n)) +
geom_col() +
theme(legend.position = 'none')

office <- office_df %>%
select(season, episode, episode_name, imdb_rating, total_votes, air_date) %>%
mutate(air_date = as.Date(air_date)) %>%
distinct(., air_date, .keep_all = TRUE)
p <- office %>%
mutate(episode_num = row_number()) %>%
ggplot(aes(episode_num, imdb_rating, group = 1, color = factor(season))) +
geom_line() +
geom_smooth(se = FALSE) +
geom_point(aes(size = total_votes)) +
#geom_text(aes(label = episode_name), check_overlap = TRUE, hjust = 1) +
theme(legend.position = 'none') +
labs(title = 'Complete IMDB ratings for the office',
subtitle = 'point size ~ # of ratings',
x = 'Episode',
y = 'IMDB rating') +
ggplot2::aes(text = paste0('Episode name: ', episode_name,
'\nIMDB rating: ', imdb_rating,
'\ntotal votes: ', format(total_votes, big.mark = ','),
'\nseason: ', season)
)
plotly::ggplotly(p, tooltip = 'text')
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
junk_words <- c('yeah', 'hey', 'uh', 'um')
names_blacklist <- top_characters %>%
select(character) %>%
as.character()
words_df <- office_df %>%
unnest_tokens('word', 'text') %>%
inner_join(top_characters %>%
select(character), by = 'character') %>%
anti_join(stop_words, by = 'word') %>%
filter(!word %in% junk_words,
!word %in% names_blacklist) %>%
count(character, word) %>%
group_by(character) %>%
slice_max(n, n = 8)
words_df %>%
slice(1:5) %>%
knitr::kable()
| Andy |
gonna |
130 |
| Andy |
guys |
118 |
| Andy |
time |
102 |
| Andy |
dwight |
80 |
| Andy |
erin |
78 |
| Angela |
dwight |
81 |
| Angela |
kevin |
40 |
| Angela |
god |
38 |
| Angela |
pam |
31 |
| Angela |
party |
31 |
| Darryl |
na |
48 |
| Darryl |
mike |
46 |
| Darryl |
gonna |
43 |
| Darryl |
cool |
23 |
| Darryl |
time |
23 |
| Dwight |
jim |
294 |
| Dwight |
michael |
282 |
| Dwight |
gonna |
150 |
| Dwight |
time |
140 |
| Dwight |
wait |
117 |
| Erin |
andy |
92 |
| Erin |
guys |
39 |
| Erin |
michael |
37 |
| Erin |
pam |
28 |
| Erin |
gabe |
26 |
| Gabe |
erin |
19 |
| Gabe |
gabe |
17 |
| Gabe |
gonna |
14 |
| Gabe |
guys |
13 |
| Gabe |
michael |
12 |
| Holly |
michael |
35 |
| Holly |
love |
22 |
| Holly |
gonna |
14 |
| Holly |
time |
13 |
| Holly |
talk |
11 |
| Jan |
michael |
159 |
| Jan |
branch |
17 |
| Jan |
gonna |
17 |
| Jan |
time |
16 |
| Jan |
pam |
14 |
| Jim |
dwight |
258 |
| Jim |
gonna |
199 |
| Jim |
pam |
189 |
| Jim |
michael |
181 |
| Jim |
alright |
133 |
| Kelly |
god |
50 |
| Kelly |
ryan |
44 |
| Kelly |
guys |
31 |
| Kelly |
gonna |
26 |
| Kelly |
michael |
26 |
| Kevin |
michael |
71 |
| Kevin |
jim |
39 |
| Kevin |
pam |
39 |
| Kevin |
oscar |
36 |
| Kevin |
guys |
33 |
| Meredith |
wait |
23 |
| Meredith |
gonna |
12 |
| Meredith |
minute |
11 |
| Meredith |
guys |
10 |
| Meredith |
meredith |
10 |
| Michael |
dwight |
353 |
| Michael |
people |
336 |
| Michael |
pam |
327 |
| Michael |
gonna |
287 |
| Michael |
time |
275 |
| Nellie |
dwight |
38 |
| Nellie |
andy |
17 |
| Nellie |
pam |
17 |
| Nellie |
gonna |
13 |
| Nellie |
jim |
13 |
| Oscar |
michael |
65 |
| Oscar |
angela |
37 |
| Oscar |
kevin |
34 |
| Oscar |
gay |
33 |
| Oscar |
andy |
29 |
| Pam |
michael |
262 |
| Pam |
jim |
182 |
| Pam |
dwight |
126 |
| Pam |
gonna |
116 |
| Pam |
time |
93 |
| Phyllis |
michael |
59 |
| Phyllis |
dwight |
34 |
| Phyllis |
andy |
26 |
| Phyllis |
bob |
24 |
| Phyllis |
time |
23 |
| Ryan |
michael |
43 |
| Ryan |
kelly |
41 |
| Ryan |
people |
25 |
| Ryan |
time |
25 |
| Ryan |
guys |
24 |
| Stanley |
michael |
27 |
| Stanley |
day |
15 |
| Stanley |
gonna |
14 |
| Stanley |
mind |
14 |
| Stanley |
christmas |
13 |
| Toby |
michael |
50 |
| Toby |
gonna |
26 |
| Toby |
talk |
26 |
| Toby |
guys |
19 |
| Toby |
time |
15 |
words_df %>%
mutate(word = reorder_within(word, n, character)) %>%
ggplot(aes(x = word, y = n, fill = character)) +
geom_col() +
scale_x_reordered() +
coord_flip() +
facet_wrap(~character, scales = "free") +
theme(legend.position = "none")

words_df %>%
bind_tf_idf(word, character, n) %>%
group_by(character) %>%
top_n(tf_idf, n = 5) %>%
ungroup() %>%
ggplot(aes(x = reorder_within(word, tf_idf, character), y = tf_idf, fill = character)) +
geom_col() +
scale_x_reordered() +
coord_flip() +
facet_wrap(~character, scales = "free") +
theme(legend.position = "none") +
labs(x = "",
y = "TF-IDF of character-word pairs")

office_df %>%
unnest_tokens('word', 'text') %>%
inner_join(top_characters %>%
select(character), by = 'character') %>%
anti_join(stop_words, by = 'word') %>%
filter(!word %in% junk_words,
!word %in% names_blacklist) %>%
select(season, word) %>%
inner_join(get_sentiments(lexicon = "bing")) %>%
group_by(season) %>%
count(sentiment) %>%
ungroup() %>%
ggplot(aes(x = season, y = n, fill = sentiment, group = sentiment)) +
geom_area() +
scale_x_continuous(breaks = 1:9) +
labs(title = 'Area plot of positive and negative sentiments over seasons')
## Joining, by = "word"
